# Step 2: GPU Acceleration

# --- Move Model to GPU ---
# .to(DEVICE) moves the model's parameters/buffers in place.
# eval() disables train-only behavior (dropout, batch-norm running-stat
# updates) so the timed forward pass reflects real inference — nn.Module
# instances default to train mode.
# NOTE(review): assumes DEVICE was defined in Step 1 (e.g. torch.device("cuda")) — confirm.
model.to(DEVICE)
model.eval()

# --- Create Single Input (Conceptual) ---
# A dummy image batch: 1 sample, 3 channels, 224x224 pixels. The tensor
# is generated on the CPU (default device for randn) and then moved to
# DEVICE so it lives where the model's parameters do.
BATCH_SIZE_UNBATCHED = 1
INPUT_TENSOR_SINGLE = torch.randn(BATCH_SIZE_UNBATCHED, 3, 224, 224).to(DEVICE)

# --- Inference (Baseline Speed) ---
# CUDA kernel launches are asynchronous: control returns to Python before
# the GPU finishes. Without torch.cuda.synchronize(), the wall-clock
# measurement captures only the launch overhead, not the computation.
# A warm-up pass is run first so one-time costs (CUDA context / cuDNN
# initialization, allocator growth) are not billed to the measured run.
with torch.no_grad():
    _ = model(INPUT_TENSOR_SINGLE)  # warm-up: absorb one-time init costs
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # drain pending work before starting the clock
    start_time = time.perf_counter()  # monotonic clock, preferred for intervals
    output_single = model(INPUT_TENSOR_SINGLE)
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # wait for the GPU to finish before stopping the clock
    end_time = time.perf_counter()
    time_single = end_time - start_time
    print(f"\nInference Time (Single, GPU): {time_single:.4f} seconds")

# ----------------------------------------------------------------------

# KEY CONCEPT: By moving computation to the 'cuda' device, we leverage the
# massive parallelism of the GPU.